# Check and Weight Govt Spending Revision et al Lucid Data
# Created: 01.03.2019

##########################################################
# This program does the following:
#
# 1. Imports GenPop Datasets (Soft Launch & Full Launch)
# 2. Imports HHI Targeted Datasets (Launch 1, 2, 3)
# 3. GenPop:
#    A. Append Datasets
# 4. Targeted:
#    A. Merge Lucid Passthrough Demo Info with Launch 1 Data
#    B. Append Datasets
# 5. Change education and political_party variable naming/coding across GenPop & Targeted Datasets
# 6. Append GenPop & Targeted Datasets
# 7. Check for and remove duplicate RIDs (respondents)  
# 8. Imports ACS Data, re-codes to match Lucid survey demo vars, calculates pop marginals with individual-level wts
# 9. Re-codes Lucid demo vars, fills in missing values
# 10. Rakes to create weights
# 11. Trims and checks weights
# 12. Subsets data by module

##########################################################

library(tidyr)
library(dplyr)
library(Hmisc)
library(ipumsr)
library(survey)

rm(list = ls())

setwd("PATH TO DATA FOLDER")

########
# GenPop 
########

# Append and remove duplicate RIDs

gp1 <- read.csv("AH_POBE_Lucid_GenPop_1.csv", as.is = TRUE)
gp2 <- read.csv("AH_POBE_Lucid_GenPop_2.csv", as.is = TRUE)

gpf <- bind_rows(gp1, gp2, .id = "wave")

# Drop respondents that were not sucessfully randomized 

gpf <- gpf[gpf$isRandomized == 1, ]

# Check number of complete observations

nrow(gpf[gpf$complete == 1, ])

##############
# HHI Targeted 
##############

tar1 <- read.csv("AH_POBE_Lucid_Target_1.csv", as.is = TRUE)
tar1 <- select(tar1, -c(education, ethnicity, gender, hhi, hispanic, political_party, region, zip))

tar1dem <- read.csv("AH_POBE_Lucid_Target_1_demos.csv", as.is = T)
tar1dem$rid <- tolower(tar1dem$rid)
head(tar1dem$rid)

tar1_merged <- left_join(tar1, tar1dem, by = "rid")

tar2 <- read.csv("AH_POBE_Lucid_Target_2.csv", as.is = TRUE)
tar3 <- read.csv("AH_POBE_Lucid_Target_3.csv", as.is = TRUE)

# Append Data

tarf <- bind_rows(tar1_merged, tar2, tar3, .id = "wave") 

# Remove Lucid Test Completes

tarf <- tarf[tarf$rid != "5b2693a4-f822-49ed-9520-d57bfcf42965", ]

# Drop respondents that were not sucessfully randomized 

tarf <- tarf[tarf$isRandomized == 1, ]

# Drop respodents that were randomized, but did not fall within the $55-100K band
# measured in the pre-survey and were terminated

table(tarf$ps3, useNA = "always")

tarf <- tarf[!is.na(tarf$ps3) & tarf$ps3 >= 11 & tarf$ps3 <= 19, ]

# Check number of complete observations

nrow(tarf[tarf$complete == 1, ])

#####################################################
# Rename Education and Political Party Var
# The codes are different between GenPop and Targeted
#####################################################

lucid_demos <- c("gender", "hispanic", "ethnicity", "education", "hhi", "region", "political_party")

for (i in 1:length(lucid_demos)) {
  print(lucid_demos[i])
  print(table(gpf[ ,lucid_demos[i]]))
  print(table(tarf[ ,lucid_demos[i]]))
}

table(gpf$education)
table(gpf$political_party)
gpf <- rename(gpf, education_short = education, political_party_long = political_party)

table(tarf$education)
table(tarf$political_party)
tarf <- rename(tarf, education_long = education, political_party_short = political_party)

############################
# Append GenPop and Targeted
# Remove Duplicates
############################

full0 <- bind_rows(gpf, tarf, .id = "sample")

full0$start <- as.POSIXct(full0$V8, format = "%Y-%m-%d %H:%M:%S")
full0$end <- as.POSIXct(full0$V9, format = "%Y-%m-%d %H:%M:%S")

dups0 <- full0 %>%
         group_by(rid) %>%
         summarise(times = n()) %>%
         filter(times > 1)

dups1 <- left_join(dups0, full0) %>%
         select(rid, start, end) %>%
         arrange(start, end)

duplicated(dups1$rid)

dups2 <- dups1[duplicated(dups1$rid), ]

full1 <- anti_join(full0, dups2)

###############################################
# Create Weights by Raking Using INDIVIDUAL WTS
###############################################

# Get population marginals from CPS

y <- read_ipums_micro(ddi = "usa_00003.xml", 
                      data_file = "usa_00003.dat", 
                      verbose = T)
head(y)

ipums_view(y)

names(y)

table(y$SEX, useNA = "always")
table(y$AGE, useNA = "always")
table(y$RACE, useNA = "always")
table(y$HISPAN, useNA = "always")
table(y$EDUCD, useNA = "always")
summary(y$HHINCOME, useNA = "always")

y <- y[y$AGE >= 18, ]

table(y$EDUCD, useNA = "always")
summary(y$HHINCOME, useNA = "always")
sum(y$HHINCOME == 9999999)/nrow(y)

y <- y[y$HHINCOME != 9999999, ]
summary(y$HHINCOME, useNA = "always")

# Users should also be sure to select one person (e.g., PERNUM = 1) to represent the entire household.
# In previous HHWT version, I do subset data to PERNUM = 1 -- not appropriate when using PERWT 

# Create HHI buckets to match survey question ps3

y$hhi <- NA
y$hhi[y$HHINCOME < 10000] <- 1
y$hhi[y$HHINCOME >= 10000 & y$HHINCOME < 15000] <- 2
y$hhi[y$HHINCOME >= 15000 & y$HHINCOME < 20000] <- 3
y$hhi[y$HHINCOME >= 20000 & y$HHINCOME < 25000] <- 4
y$hhi[y$HHINCOME >= 25000 & y$HHINCOME < 30000] <- 5
y$hhi[y$HHINCOME >= 30000 & y$HHINCOME < 35000] <- 6
y$hhi[y$HHINCOME >= 35000 & y$HHINCOME < 40000] <- 7
y$hhi[y$HHINCOME >= 40000 & y$HHINCOME < 45000] <- 8
y$hhi[y$HHINCOME >= 45000 & y$HHINCOME < 50000] <- 9
y$hhi[y$HHINCOME >= 50000 & y$HHINCOME < 55000] <- 10
y$hhi[y$HHINCOME >= 55000 & y$HHINCOME < 60000] <- 11
y$hhi[y$HHINCOME >= 60000 & y$HHINCOME < 65000] <- 12
y$hhi[y$HHINCOME >= 65000 & y$HHINCOME < 70000] <- 13
y$hhi[y$HHINCOME >= 70000 & y$HHINCOME < 75000] <- 14
y$hhi[y$HHINCOME >= 75000 & y$HHINCOME < 80000] <- 15
y$hhi[y$HHINCOME >= 80000 & y$HHINCOME < 85000] <- 16
y$hhi[y$HHINCOME >= 85000 & y$HHINCOME < 90000] <- 17
y$hhi[y$HHINCOME >= 90000 & y$HHINCOME < 95000] <- 18
y$hhi[y$HHINCOME >= 95000 & y$HHINCOME < 100000] <- 19
y$hhi[y$HHINCOME >= 100000 & y$HHINCOME < 125000] <- 20
y$hhi[y$HHINCOME >= 125000 & y$HHINCOME < 150000] <- 21
y$hhi[y$HHINCOME >= 150000 & y$HHINCOME < 175000] <- 22
y$hhi[y$HHINCOME >= 175000 & y$HHINCOME < 200000] <- 23
y$hhi[y$HHINCOME >= 200000 & y$HHINCOME < 250000] <- 24
y$hhi[y$HHINCOME >= 250000] <- 25

table(y$hhi, useNA = "always")

# Create age group buckets

class(y$AGE)
summary(y$AGE)
table(y$AGE, useNA = "always")

y$agegrp <- NA
y$agegrp[y$AGE >= 18 & y$AGE < 25] <- 1
y$agegrp[y$AGE >= 25 & y$AGE < 30] <- 2
y$agegrp[y$AGE >= 30 & y$AGE < 50] <- 3
y$agegrp[y$AGE >= 50 & y$AGE < 70] <- 4
y$agegrp[y$AGE >= 70] <- 5

sum(is.na(y$agegrp))

# Create race/ethnicity dummies

table(race = y$RACE, hisp = y$HISPAN, useNA = "always")

y$black <- 0
y$black[y$RACE == 2] <- 1
table(y$RACE, y$black, useNA = "always")

y$latino <- 0
y$latino[y$HISPAN > 0] <- 1
table(y$HISPAN, y$latino, useNA = "always")

# Create collapsed education variable

table(y$EDUCD, useNA = "always")

y$edu <- NA
y$edu[y$EDUCD >= 2 & y$EDUCD <= 61] <- 1
y$edu[y$EDUCD >= 62 & y$EDUCD <= 64] <- 2
y$edu[y$EDUCD >= 65 & y$EDUCD <= 90] <- 3
y$edu[y$EDUCD >= 100 & y$EDUCD <= 113] <- 4
y$edu[y$EDUCD >= 114 & y$EDUCD <= 116] <- 5

table(y$EDUCD, y$edu, useNA = "always")

# Create female dummy

y$female <- 0
y$female[y$SEX == 2] <- 1
table(y$female, y$SEX, useNA = "always")

# CALCULATE POPULATION MARGINALS

# Age Group X Education

y$agegrp_x_edu <- paste(y$agegrp, "-", y$edu, sep = "")
head(y[ ,c("agegrp", "edu", "agegrp_x_edu")])

agegrp_x_edu_marg <- wtd.table(y$agegrp_x_edu, weights = y$PERWT)

agegrp_x_edu_marg$prop <- 
  agegrp_x_edu_marg$sum.of.weights/sum(agegrp_x_edu_marg$sum.of.weights)

agegrp_x_edu_marg
sum(agegrp_x_edu_marg$prop)

# HHI

hhi_marg <- wtd.table(y$hhi, weights = y$PERWT)
hhi_marg$prop <- hhi_marg$sum.of.weights/sum(hhi_marg$sum.of.weights)

hhi_marg
sum(hhi_marg$prop)

# Gender (female = 1)

female_marg <- wtd.table(y$female, weights = y$PERWT)
female_marg$prop <- female_marg$sum.of.weights/sum(female_marg$sum.of.weights)

female_marg
sum(female_marg$prop)

wtd.mean(y$female, y$PERWT)

# Black

black_marg <- wtd.table(y$black, weights = y$PERWT)
black_marg$prop <- black_marg$sum.of.weights/sum(black_marg$sum.of.weights)

black_marg
sum(black_marg$prop)

# Latino

latino_marg <- wtd.table(y$latino, weights = y$PERWT)
latino_marg$prop <- latino_marg$sum.of.weights/sum(latino_marg$sum.of.weights) 

latino_marg
sum(latino_marg$prop)

# CREATE CORRESPONDING DEMO INDICATORS IN SAMPLE

# Age Group X Education

table(full1$ps6, useNA = "always")

full1$age <- 2018 - (full1$ps6 + 1919)
table(full1$age, useNA = "always")

# Drop observations with stated age < 18

full2 <- full1[is.na(full1$age) | full1$age >= 18, ]
table(full2$age, useNA = "always")

# Fill in missing values

full2$age_r <- full2$age

set.seed(1456)
full2$age_r[is.na(full2$age_r)] <- sample(full2$age_r[!is.na(full2$age_r)], sum(is.na(full2$age_r)),
                                          replace = TRUE)
table(full2$age_r, is.na(full2$age), useNA = "always")

# Create agegrp variable

full2$agegrp_r <- NA
full2$agegrp_r[full2$age_r >= 18 & full2$age_r < 25] <- 1
full2$agegrp_r[full2$age_r >= 25 & full2$age_r < 30] <- 2
full2$agegrp_r[full2$age_r >= 30 & full2$age_r < 50] <- 3
full2$agegrp_r[full2$age_r >= 50 & full2$age_r < 70] <- 4
full2$agegrp_r[full2$age_r >= 70] <- 5

table(full2$age_r, full2$agegrp_r, useNA = "always")

# Create unified education variable

table(full2$sample, full2$education_long, useNA = "always")
table(full2$sample, full2$education_short, useNA = "always")

full2$edu <- NA
full2$edu[(full2$education_long <= 3) | (full2$education_short == 1)] <- 1
full2$edu[(full2$education_long == 4) | (full2$education_short == 2)] <- 2
full2$edu[(full2$education_long %in% c(5,6,7)) | (full2$education_short %in% c(3,4,5))] <- 3
full2$edu[(full2$education_long == 8) | (full2$education_short == 6)] <- 4
full2$edu[(full2$education_long %in% c(9,10,11)) | (full2$education_short %in% c(7,8))] <- 5

table(full2$edu, useNA = "always")
table(full2$edu, full2$education_long, useNA = "always")
table(full2$edu, full2$education_short, useNA = "always")

# Fill in missing values

full2$edu_r <- full2$edu

set.seed(1953)
full2$edu_r[is.na(full2$edu_r)] <- sample(full2$edu_r[!is.na(full2$edu_r)], sum(is.na(full2$edu_r)),
                                          replace = TRUE)

table(full2$edu_r, full2$edu, useNA = "always")

# Create combined agegroup-edu indicator

full2$agegrp_x_edu_r <- paste(full2$agegrp_r, "-", full2$edu_r, sep = "")
table(full2$agegrp_x_edu_r, useNA = "always")

# HHI
# Change Lucid passthrough hhi var name to avoid confusion

full2 <- rename(full2, hhi_lucid = hhi)

table(full2$ps3, useNA = "always")

full2$hhi_r <- full2$ps3

set.seed(4596)
full2$hhi_r[is.na(full2$hhi_r)] <- sample(full2$hhi_r[!is.na(full2$hhi_r)], sum(is.na(full2$hhi_r)),
                                          replace = TRUE)
table(full2$hhi_r, full2$ps3, useNA = "always")

# Gender

table(full2$gender, useNA = "always")

# No missing values

full2$female <- 0
full2$female[full2$gender == 2] <- 1

table(full2$female, full2$gender, useNA = "always")

# Black and Latino 

table(full2$ethnicity, useNA = "always")

# ethnicity = 16 (prefer not to answer); should be treated as missing

# Create black dummy vars

full2$black <- NA
full2$black[!(full2$ethnicity %in% c(2,16))] <- 0
full2$black[full2$ethnicity == 2] <- 1
table(full2$black, full2$ethnicity, useNA = "always")

full2$black_r <- full2$black

set.seed(3859)
full2$black_r[is.na(full2$black_r)] <- sample(full2$black_r[!is.na(full2$black_r)], 
                                              sum(is.na(full2$black_r)), replace = TRUE)
table(full2$black_r, full2$black, useNA = "always")

# Create Latino dummy vars

table(full2$hispanic, useNA = "always")

# hispanic = 15 (prefer not to answer); should be treated as missing

full2$latino <- NA
full2$latino[full2$hispanic == 1] <- 0
full2$latino[full2$hispanic >= 2 & full2$hispanic <= 14] <- 1
table(full2$latino, full2$hispanic, useNA = "always")

full2$latino_r <- full2$latino

set.seed(4950)
full2$latino_r[is.na(full2$latino_r)] <- sample(full2$latino_r[!is.na(full2$latino_r)],
                                                sum(is.na(full2$latino_r)), replace = TRUE)

table(full2$latino_r, full2$latino, useNA = "always")

# Subset sample to complete responses before raking

full3 <- full2[full2$complete == 1, ]

# CREATE RAKE WEIGHTS

the.vars <- c("agegrp_x_edu_r", "hhi_r", "female", "black_r", "latino_r")

poptargs <- list(
  data.frame(agegrp_x_edu_r = agegrp_x_edu_marg$x, Freq = nrow(full3)*agegrp_x_edu_marg$prop),
  data.frame(hhi_r = hhi_marg$x, Freq = nrow(full3)*hhi_marg$prop),
  data.frame(female = female_marg$x, Freq = nrow(full3)*female_marg$prop),
  data.frame(black_r = black_marg$x, Freq = nrow(full3)*black_marg$prop),
  data.frame(latino_r = latino_marg$x, Freq = nrow(full3)*latino_marg$prop)
)

poptargs

dt.svy <- svydesign(ids = ~1, data = full3, weights = NULL)

dt.svy.rk <- rake(dt.svy, 
                  sample.margins = list(~agegrp_x_edu_r, ~hhi_r, ~female, ~black_r, ~latino_r),
                  population.margins = poptargs)

cat("Summary of distribution of weights before trimming:\n")
the.wts <- weights(dt.svy.rk)
print(summary(the.wts))
print(quantile(the.wts,probs=seq(0,1,.05)))
cat("Observations with max and min weights assigned. \nMax:\n")
print(full3[the.wts == max(the.wts),the.vars])
cat("Min:\n")
print(full3[the.wts == min(the.wts),the.vars])

# Trim the weights? (Weights are constrained bewteen 1/8 and 8, total trim is distributed across
# untrimmed observations)

if (max(the.wts) > 8) {
  
  dt.svy.rk2 <- trimWeights(dt.svy.rk,lower=1/8,upper=8,strict=T)
  cat("Summary of distribution of weights after trimming:\n")
  print(summary(the.wts <- weights(dt.svy.rk2)))
  cat("Observations with max and min weights aDTgned.\nMax:\n")
  print(full3[the.wts == max(the.wts),the.vars])
  cat("Min:\n")
  print(full3[the.wts == min(the.wts),the.vars])
  
} else {
  dt.svy.rk2 <- dt.svy.rk
}

# Run some checks on rake weights

summary(the.wts)

head(full3$V1)
sum(duplicated(full3$V1))
wts.for.merge <- data.frame(V1 = dt.svy.rk2$variables[,"V1"], rwt = the.wts,
                            stringsAsFactors = FALSE)

head(wts.for.merge)
class(wts.for.merge$V1)

# Merge weights back onto complete data

x <- left_join(full3, wts.for.merge)
names(x)

head(wts.for.merge)
head(x[ ,c("V1","rwt")])

# Compare Sample and Population margins

sum(x$rwt)

comp_margins <- function(sv, mar) {

post <- wtd.table(x[ ,sv], weights = x$rwt)
post$prop <- post$sum.of.weights/sum(post$sum.of.weights)

print(mar[["prop"]] - post$prop)
print(max(abs((mar[["prop"]] - post$prop))))
}

comp_margins("agegrp_x_edu_r", agegrp_x_edu_marg)
comp_margins("female", female_marg)
comp_margins("hhi_r", hhi_marg)
comp_margins("black_r", black_marg)
comp_margins("latino_r", latino_marg)

# Output data for GS Analysis

save(full2, x, 
     file = "Govt_Spend_Rev_Appended_IND_WTS.RData")
